{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Processing\n",
    "\n",
    "## Capturing Text Data\n",
    "\n",
    "### Plain Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Read in a plain text file.\n",
    "# The encoding is passed explicitly so decoding does not depend on the\n",
    "# platform's locale default (assumes the file is UTF-8 -- TODO confirm).\n",
    "with open(os.path.join(\"data\", \"hieroglyph.txt\"), \"r\", encoding=\"utf-8\") as f:\n",
    "    text = f.read()\n",
    "\n",
    "# Print after the with-block so the file handle is already closed.\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tabular Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Extract text column from a dataframe\n",
    "# (relies on `os` imported by an earlier cell).\n",
    "df = pd.read_csv(os.path.join(\"data\", \"news.csv\"))\n",
    "\n",
    "# Only the last expression of a cell is rendered automatically, so the\n",
    "# \"before\" preview needs an explicit display() call to actually show up.\n",
    "display(df.head()[['publisher', 'title']])\n",
    "\n",
    "# Convert text column to lowercase\n",
    "df['title'] = df['title'].str.lower()\n",
    "\n",
    "# \"After\" preview: rendered as the cell's output value.\n",
    "df.head()[['publisher', 'title']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Online Resource"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "\n",
    "# Fetch data from a REST API.\n",
    "# requests applies no timeout by default, so one is set explicitly to keep\n",
    "# a stalled connection from blocking the kernel indefinitely.\n",
    "r = requests.get(\n",
    "    \"https://quotes.rest/qod.json\", timeout=10)\n",
    "\n",
    "# Raise an HTTPError on a 4xx/5xx reply here, rather than failing later\n",
    "# with an opaque KeyError when the error payload lacks \"contents\".\n",
    "r.raise_for_status()\n",
    "\n",
    "res = r.json()\n",
    "print(json.dumps(res, indent=4))\n",
    "\n",
    "# Extract relevant object and field\n",
    "q = res[\"contents\"][\"quotes\"][0]\n",
    "print(q[\"quote\"], \"\\n--\", q[\"author\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html op=\"news\"><head><meta name=\"referrer\" content=\"origin\"><meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\"><link rel=\"stylesheet\" type=\"text/css\" href=\"news.css?xDJZ1aWhiD4MZBrpGsuq\">\n",
      "            <link rel=\"shortcut icon\" href=\"favicon.ico\">\n",
      "          <link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS\" href=\"rss\">\n",
      "        <title>Hacker News</title></head><body><center><table id=\"hnmain\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\" width=\"85%\" bgcolor=\"#f6f6ef\">\n",
      "        <tr><td bgcolor=\"#ff6600\"><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" width=\"100%\" style=\"padding:2px\"><tr><td style=\"width:18px;padding-right:4px\"><a href=\"https://news.ycombinator.com\"><img src=\"y18.gif\" width=\"18\" height=\"18\" style=\"border:1px white solid;\"></a></td>\n",
      "                  <td style=\"line-height:12pt; height:10px;\"><span class=\"pagetop\"><b class=\"hnname\"><a href=\"news\">Hacker News</a></b>\n",
      "              <a href=\"newest\">new</a> | <a href=\"front\">past</a> | <a href=\"newcomments\">comments</a> | <a href=\"ask\">ask</a> | <a href=\"show\">show</a> | <a href=\"jobs\">jobs</a> | <a href=\"submit\">submit</a>            </span></td><td style=\"text-align:right;padding-right:4px;\"><span class=\"pagetop\">\n",
      "                              <a href=\"login?goto=news\">login</a>\n",
      "                          </span></td>\n",
      "              </tr></table></td></tr>\n",
      "<tr id=\"pagespace\" title=\"\" style=\"height:10px\"></tr><tr><td><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" class=\"itemlist\">\n",
      "              <tr class='athing' id='20044876'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">1.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20044876' href='vote?id=20044876&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://courses.csail.mit.edu/6.851/fall17/\" class=\"storylink\">Advanced Data Structures</a><span class=\"sitebit comhead\"> (<a href=\"from?site=mit.edu\"><span class=\"sitestr\">mit.edu</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20044876\">315 points</span> by <a href=\"user?id=rjammala\" class=\"hnuser\">rjammala</a> <span class=\"age\"><a href=\"item?id=20044876\">2 hours ago</a></span> <span id=\"unv_20044876\"></span> | <a href=\"hide?id=20044876&amp;goto=news\">hide</a> | <a href=\"item?id=20044876\">27&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20044430'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">2.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20044430' href='vote?id=20044430&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://9to5google.com/2019/05/29/chrome-ad-blocking-enterprise-manifest-v3/\" class=\"storylink\">Google to restrict modern ad blocking Chrome extensions to enterprise users</a><span class=\"sitebit comhead\"> (<a href=\"from?site=9to5google.com\"><span class=\"sitestr\">9to5google.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20044430\">221 points</span> by <a href=\"user?id=estranhosidade\" class=\"hnuser\">estranhosidade</a> <span class=\"age\"><a href=\"item?id=20044430\">3 hours ago</a></span> <span id=\"unv_20044430\"></span> | <a href=\"hide?id=20044430&amp;goto=news\">hide</a> | <a href=\"item?id=20044430\">119&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20045742'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">3.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20045742' href='vote?id=20045742&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://gavv.github.io/articles/roc-0.1/\" class=\"storylink\">Roc – Real-Time streaming over the network</a><span class=\"sitebit comhead\"> (<a href=\"from?site=gavv.github.io\"><span class=\"sitestr\">gavv.github.io</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20045742\">33 points</span> by <a href=\"user?id=gavv42\" class=\"hnuser\">gavv42</a> <span class=\"age\"><a href=\"item?id=20045742\">1 hour ago</a></span> <span id=\"unv_20045742\"></span> | <a href=\"hide?id=20045742&amp;goto=news\">hide</a> | <a href=\"item?id=20045742\">9&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20045380'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">4.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20045380' href='vote?id=20045380&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://dothemath.ucsd.edu/2012/04/economist-meets-physicist\" class=\"storylink\">Exponential economist meets finite physicist</a><span class=\"sitebit comhead\"> (<a href=\"from?site=ucsd.edu\"><span class=\"sitestr\">ucsd.edu</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20045380\">42 points</span> by <a href=\"user?id=chepaslaaa\" class=\"hnuser\">chepaslaaa</a> <span class=\"age\"><a href=\"item?id=20045380\">2 hours ago</a></span> <span id=\"unv_20045380\"></span> | <a href=\"hide?id=20045380&amp;goto=news\">hide</a> | <a href=\"item?id=20045380\">26&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20042355'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">5.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20042355' href='vote?id=20042355&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://0x.org/launch-kit\" class=\"storylink\">0x Launch Kit – Launch your own cryptocurrency exchange or marketplace</a><span class=\"sitebit comhead\"> (<a href=\"from?site=0x.org\"><span class=\"sitestr\">0x.org</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20042355\">296 points</span> by <a href=\"user?id=tomhschmidt\" class=\"hnuser\">tomhschmidt</a> <span class=\"age\"><a href=\"item?id=20042355\">6 hours ago</a></span> <span id=\"unv_20042355\"></span> | <a href=\"hide?id=20042355&amp;goto=news\">hide</a> | <a href=\"item?id=20042355\">115&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20039891'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">6.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20039891' href='vote?id=20039891&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://opensourceconnections.com/blog/2019/05/29/falsehoods-programmers-believe-about-search/\" class=\"storylink\">Falsehoods Programmers Believe About Search</a><span class=\"sitebit comhead\"> (<a href=\"from?site=opensourceconnections.com\"><span class=\"sitestr\">opensourceconnections.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20039891\">228 points</span> by <a href=\"user?id=binarymax\" class=\"hnuser\">binarymax</a> <span class=\"age\"><a href=\"item?id=20039891\">9 hours ago</a></span> <span id=\"unv_20039891\"></span> | <a href=\"hide?id=20039891&amp;goto=news\">hide</a> | <a href=\"item?id=20039891\">138&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20043410'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">7.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20043410' href='vote?id=20043410&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.citylab.com/transportation/2019/05/elon-musk-tunnel-las-vegas-loop-boring-company-electric-cars/590287/\" class=\"storylink\">The Boring Company will develop an underground “people mover” for Las Vegas</a><span class=\"sitebit comhead\"> (<a href=\"from?site=citylab.com\"><span class=\"sitestr\">citylab.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20043410\">150 points</span> by <a href=\"user?id=cienega\" class=\"hnuser\">cienega</a> <span class=\"age\"><a href=\"item?id=20043410\">5 hours ago</a></span> <span id=\"unv_20043410\"></span> | <a href=\"hide?id=20043410&amp;goto=news\">hide</a> | <a href=\"item?id=20043410\">259&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20039980'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">8.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20039980' href='vote?id=20039980&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://theconversation.com/complex-life-may-only-exist-because-of-millions-of-years-of-groundwork-by-ancient-fungi-117526\" class=\"storylink\">Complex life may only exist because of millions of years of groundwork by fungi</a><span class=\"sitebit comhead\"> (<a href=\"from?site=theconversation.com\"><span class=\"sitestr\">theconversation.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20039980\">131 points</span> by <a href=\"user?id=pradpk\" class=\"hnuser\">pradpk</a> <span class=\"age\"><a href=\"item?id=20039980\">9 hours ago</a></span> <span id=\"unv_20039980\"></span> | <a href=\"hide?id=20039980&amp;goto=news\">hide</a> | <a href=\"item?id=20039980\">42&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20038619'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">9.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20038619' href='vote?id=20038619&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://solar.lowtechmagazine.com/power.html\" class=\"storylink\">A website that runs on a solar-powered server in Barcelona</a><span class=\"sitebit comhead\"> (<a href=\"from?site=lowtechmagazine.com\"><span class=\"sitestr\">lowtechmagazine.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20038619\">279 points</span> by <a href=\"user?id=peey\" class=\"hnuser\">peey</a> <span class=\"age\"><a href=\"item?id=20038619\">12 hours ago</a></span> <span id=\"unv_20038619\"></span> | <a href=\"hide?id=20038619&amp;goto=news\">hide</a> | <a href=\"item?id=20038619\">86&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20041076'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">10.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20041076' href='vote?id=20041076&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://pugsql.org\" class=\"storylink\">Show HN: PugSQL, a Python Port of HugSQL</a><span class=\"sitebit comhead\"> (<a href=\"from?site=pugsql.org\"><span class=\"sitestr\">pugsql.org</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20041076\">114 points</span> by <a href=\"user?id=mcfunley\" class=\"hnuser\">mcfunley</a> <span class=\"age\"><a href=\"item?id=20041076\">8 hours ago</a></span> <span id=\"unv_20041076\"></span> | <a href=\"hide?id=20041076&amp;goto=news\">hide</a> | <a href=\"item?id=20041076\">39&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20040779'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">11.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20040779' href='vote?id=20040779&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://lwn.net/SubscriberLink/789600/101b40d06e0dfb80/\" class=\"storylink\">A way to do atomic writes</a><span class=\"sitebit comhead\"> (<a href=\"from?site=lwn.net\"><span class=\"sitestr\">lwn.net</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20040779\">89 points</span> by <a href=\"user?id=Tomte\" class=\"hnuser\">Tomte</a> <span class=\"age\"><a href=\"item?id=20040779\">8 hours ago</a></span> <span id=\"unv_20040779\"></span> | <a href=\"hide?id=20040779&amp;goto=news\">hide</a> | <a href=\"item?id=20040779\">34&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20038959'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">12.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20038959' href='vote?id=20038959&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://phys.org/news/2019-05-forbidden-planet-neptunian.html\" class=\"storylink\">An exoplanet has been found in the ‘Neptunian Desert’</a><span class=\"sitebit comhead\"> (<a href=\"from?site=phys.org\"><span class=\"sitestr\">phys.org</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20038959\">130 points</span> by <a href=\"user?id=lelf\" class=\"hnuser\">lelf</a> <span class=\"age\"><a href=\"item?id=20038959\">10 hours ago</a></span> <span id=\"unv_20038959\"></span> | <a href=\"hide?id=20038959&amp;goto=news\">hide</a> | <a href=\"item?id=20038959\">54&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20045309'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">13.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20045309' href='vote?id=20045309&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://boingboing.net/2019/05/29/hoarding-software-freedom.html\" class=\"storylink\">DRM enabled Google to have an open source browser still under its control</a><span class=\"sitebit comhead\"> (<a href=\"from?site=boingboing.net\"><span class=\"sitestr\">boingboing.net</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20045309\">67 points</span> by <a href=\"user?id=phowat\" class=\"hnuser\">phowat</a> <span class=\"age\"><a href=\"item?id=20045309\">2 hours ago</a></span> <span id=\"unv_20045309\"></span> | <a href=\"hide?id=20045309&amp;goto=news\">hide</a> | <a href=\"item?id=20045309\">37&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20039402'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">14.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20039402' href='vote?id=20039402&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.quantamagazine.org/whats-the-magic-behind-graphenes-magic-angle-20190528/\" class=\"storylink\">What’s the Magic Behind Graphene’s ‘Magic’ Angle?</a><span class=\"sitebit comhead\"> (<a href=\"from?site=quantamagazine.org\"><span class=\"sitestr\">quantamagazine.org</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20039402\">128 points</span> by <a href=\"user?id=pseudolus\" class=\"hnuser\">pseudolus</a> <span class=\"age\"><a href=\"item?id=20039402\">10 hours ago</a></span> <span id=\"unv_20039402\"></span> | <a href=\"hide?id=20039402&amp;goto=news\">hide</a> | <a href=\"item?id=20039402\">16&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20040002'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">15.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20040002' href='vote?id=20040002&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://code.fb.com/security/service-encryption/\" class=\"storylink\">Building Facebook's Service Encryption Infastructure</a><span class=\"sitebit comhead\"> (<a href=\"from?site=fb.com\"><span class=\"sitestr\">fb.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20040002\">129 points</span> by <a href=\"user?id=sudoyear123\" class=\"hnuser\">sudoyear123</a> <span class=\"age\"><a href=\"item?id=20040002\">9 hours ago</a></span> <span id=\"unv_20040002\"></span> | <a href=\"hide?id=20040002&amp;goto=news\">hide</a> | <a href=\"item?id=20040002\">36&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20040485'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">16.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20040485' href='vote?id=20040485&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://developer.salesforce.com/blogs/2019/05/introducing-lightning-web-components-open-source.html\" class=\"storylink\">Open-sourcing the Lightning Web Components framework</a><span class=\"sitebit comhead\"> (<a href=\"from?site=salesforce.com\"><span class=\"sitestr\">salesforce.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20040485\">113 points</span> by <a href=\"user?id=kungfudoi\" class=\"hnuser\">kungfudoi</a> <span class=\"age\"><a href=\"item?id=20040485\">8 hours ago</a></span> <span id=\"unv_20040485\"></span> | <a href=\"hide?id=20040485&amp;goto=news\">hide</a> | <a href=\"item?id=20040485\">65&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20043310'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">17.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20043310' href='vote?id=20043310&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://krebsonsecurity.com/2019/05/should-failing-phish-tests-be-a-fireable-offense/\" class=\"storylink\">Should Failing Phish Tests Be a Fireable Offense?</a><span class=\"sitebit comhead\"> (<a href=\"from?site=krebsonsecurity.com\"><span class=\"sitestr\">krebsonsecurity.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20043310\">99 points</span> by <a href=\"user?id=headalgorithm\" class=\"hnuser\">headalgorithm</a> <span class=\"age\"><a href=\"item?id=20043310\">5 hours ago</a></span> <span id=\"unv_20043310\"></span> | <a href=\"hide?id=20043310&amp;goto=news\">hide</a> | <a href=\"item?id=20043310\">212&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20045794'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">18.</span></td>      <td></td><td class=\"title\"><a href=\"https://keyvalues.com/sparkswap\" class=\"storylink\" rel=\"nofollow\">Hack on Bitcoin in SF at Sparkswap (YC S18)</a><span class=\"sitebit comhead\"> (<a href=\"from?site=keyvalues.com\"><span class=\"sitestr\">keyvalues.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"age\"><a href=\"item?id=20045794\">1 hour ago</a></span> | <a href=\"hide?id=20045794&amp;goto=news\">hide</a>      </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20041384'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">19.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20041384' href='vote?id=20041384&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://avc.com/2019/05/outschooling/\" class=\"storylink\">Outschooling</a><span class=\"sitebit comhead\"> (<a href=\"from?site=avc.com\"><span class=\"sitestr\">avc.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20041384\">78 points</span> by <a href=\"user?id=ctulek\" class=\"hnuser\">ctulek</a> <span class=\"age\"><a href=\"item?id=20041384\">7 hours ago</a></span> <span id=\"unv_20041384\"></span> | <a href=\"hide?id=20041384&amp;goto=news\">hide</a> | <a href=\"item?id=20041384\">99&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20042842'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">20.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20042842' href='vote?id=20042842&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://devblogs.microsoft.com/typescript/announcing-typescript-3-5\" class=\"storylink\">TypeScript 3.5</a><span class=\"sitebit comhead\"> (<a href=\"from?site=microsoft.com\"><span class=\"sitestr\">microsoft.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20042842\">196 points</span> by <a href=\"user?id=DanRosenwasser\" class=\"hnuser\">DanRosenwasser</a> <span class=\"age\"><a href=\"item?id=20042842\">5 hours ago</a></span> <span id=\"unv_20042842\"></span> | <a href=\"hide?id=20042842&amp;goto=news\">hide</a> | <a href=\"item?id=20042842\">43&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20040182'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">21.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20040182' href='vote?id=20040182&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://blog.plaid.com/plaid-in-the-uk/\" class=\"storylink\">Plaid Launches in the UK</a><span class=\"sitebit comhead\"> (<a href=\"from?site=plaid.com\"><span class=\"sitestr\">plaid.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20040182\">129 points</span> by <a href=\"user?id=jessedhillon\" class=\"hnuser\">jessedhillon</a> <span class=\"age\"><a href=\"item?id=20040182\">9 hours ago</a></span> <span id=\"unv_20040182\"></span> | <a href=\"hide?id=20040182&amp;goto=news\">hide</a> | <a href=\"item?id=20040182\">67&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20041321'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">22.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20041321' href='vote?id=20041321&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://techcrunch.com/2019/05/29/uber-will-start-deactivating-riders-with-low-ratings/\" class=\"storylink\">Uber will start deactivating riders with low ratings</a><span class=\"sitebit comhead\"> (<a href=\"from?site=techcrunch.com\"><span class=\"sitestr\">techcrunch.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20041321\">188 points</span> by <a href=\"user?id=hbcondo714\" class=\"hnuser\">hbcondo714</a> <span class=\"age\"><a href=\"item?id=20041321\">7 hours ago</a></span> <span id=\"unv_20041321\"></span> | <a href=\"hide?id=20041321&amp;goto=news\">hide</a> | <a href=\"item?id=20041321\">487&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20040868'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">23.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20040868' href='vote?id=20040868&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.apple.com/ios/app-store/principles-practices/\" class=\"storylink\">Apple App Store Principles and Practices</a><span class=\"sitebit comhead\"> (<a href=\"from?site=apple.com\"><span class=\"sitestr\">apple.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20040868\">136 points</span> by <a href=\"user?id=tosh\" class=\"hnuser\">tosh</a> <span class=\"age\"><a href=\"item?id=20040868\">8 hours ago</a></span> <span id=\"unv_20040868\"></span> | <a href=\"hide?id=20040868&amp;goto=news\">hide</a> | <a href=\"item?id=20040868\">233&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20039314'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">24.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20039314' href='vote?id=20039314&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.bloomberg.com/news/articles/2019-05-29/long-before-crash-ethiopian-air-pilot-warned-bosses-of-dangers\" class=\"storylink\">Long Before Boeing 737 Max Crash, Ethiopian Air Pilot Warned of Dangers</a><span class=\"sitebit comhead\"> (<a href=\"from?site=bloomberg.com\"><span class=\"sitestr\">bloomberg.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20039314\">176 points</span> by <a href=\"user?id=pseudolus\" class=\"hnuser\">pseudolus</a> <span class=\"age\"><a href=\"item?id=20039314\">10 hours ago</a></span> <span id=\"unv_20039314\"></span> | <a href=\"hide?id=20039314&amp;goto=news\">hide</a> | <a href=\"item?id=20039314\">108&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20038852'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">25.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20038852' href='vote?id=20038852&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://github.com/angular/angular-cli/releases\" class=\"storylink\">Angular v8.0</a><span class=\"sitebit comhead\"> (<a href=\"from?site=github.com\"><span class=\"sitestr\">github.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20038852\">241 points</span> by <a href=\"user?id=tashoecraft\" class=\"hnuser\">tashoecraft</a> <span class=\"age\"><a href=\"item?id=20038852\">11 hours ago</a></span> <span id=\"unv_20038852\"></span> | <a href=\"hide?id=20038852&amp;goto=news\">hide</a> | <a href=\"item?id=20038852\">184&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20045791'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">26.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20045791' href='vote?id=20045791&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.thedrive.com/the-war-zone/28282/u-s-says-russia-might-be-setting-off-very-low-yield-nuclear-weapons-on-this-arctic-island\" class=\"storylink\">U.S. Says Russia Might Be Setting Off Low-Yield Nuclear Weapons</a><span class=\"sitebit comhead\"> (<a href=\"from?site=thedrive.com\"><span class=\"sitestr\">thedrive.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20045791\">14 points</span> by <a href=\"user?id=tomohawk\" class=\"hnuser\">tomohawk</a> <span class=\"age\"><a href=\"item?id=20045791\">1 hour ago</a></span> <span id=\"unv_20045791\"></span> | <a href=\"hide?id=20045791&amp;goto=news\">hide</a> | <a href=\"item?id=20045791\">discuss</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20042147'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">27.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20042147' href='vote?id=20042147&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://russ.app/2019/05/delisted-overnight\" class=\"storylink\">Delisted Overnight: A Cautionary Tale for Indie iOS Developers</a><span class=\"sitebit comhead\"> (<a href=\"from?site=russ.app\"><span class=\"sitestr\">russ.app</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20042147\">179 points</span> by <a href=\"user?id=rooster8\" class=\"hnuser\">rooster8</a> <span class=\"age\"><a href=\"item?id=20042147\">6 hours ago</a></span> <span id=\"unv_20042147\"></span> | <a href=\"hide?id=20042147&amp;goto=news\">hide</a> | <a href=\"item?id=20042147\">98&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20038374'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">28.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20038374' href='vote?id=20038374&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://genode.org/documentation/release-notes/19.05\" class=\"storylink\">Genode OS Framework 19.05</a><span class=\"sitebit comhead\"> (<a href=\"from?site=genode.org\"><span class=\"sitestr\">genode.org</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20038374\">71 points</span> by <a href=\"user?id=snvzz\" class=\"hnuser\">snvzz</a> <span class=\"age\"><a href=\"item?id=20038374\">10 hours ago</a></span> <span id=\"unv_20038374\"></span> | <a href=\"hide?id=20038374&amp;goto=news\">hide</a> | <a href=\"item?id=20038374\">17&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20039863'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">29.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20039863' href='vote?id=20039863&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.bloomberg.com/news/articles/2019-05-28/buyer-s-remorse-high-debt-and-low-pay-leave-some-grads-rueful\" class=\"storylink\">Buyer's Remorse: High Debt and Low Pay Leave Some College Grads Rueful</a><span class=\"sitebit comhead\"> (<a href=\"from?site=bloomberg.com\"><span class=\"sitestr\">bloomberg.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20039863\">220 points</span> by <a href=\"user?id=pseudolus\" class=\"hnuser\">pseudolus</a> <span class=\"age\"><a href=\"item?id=20039863\">9 hours ago</a></span> <span id=\"unv_20039863\"></span> | <a href=\"hide?id=20039863&amp;goto=news\">hide</a> | <a href=\"item?id=20039863\">412&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='20039892'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">30.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_20039892' href='vote?id=20039892&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://onezero.medium.com/id-at-the-door-meet-the-security-company-building-an-international-database-of-banned-bar-patrons-7c6d4b236fc3\" class=\"storylink\">Security Company Building an International Database of Banned Bar Patrons</a><span class=\"sitebit comhead\"> (<a href=\"from?site=onezero.medium.com\"><span class=\"sitestr\">onezero.medium.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_20039892\">94 points</span> by <a href=\"user?id=jbegley\" class=\"hnuser\">jbegley</a> <span class=\"age\"><a href=\"item?id=20039892\">8 hours ago</a></span> <span id=\"unv_20039892\"></span> | <a href=\"hide?id=20039892&amp;goto=news\">hide</a> | <a href=\"item?id=20039892\">115&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "            <tr class=\"morespace\" style=\"height:10px\"></tr><tr><td colspan=\"2\"></td><td class=\"title\"><a href=\"news?p=2\" class=\"morelink\" rel=\"next\">More</a></td></tr>\n",
      "  </table>\n",
      "</td></tr>\n",
      "<tr><td><img src=\"s.gif\" height=\"10\" width=\"0\"><table width=\"100%\" cellspacing=\"0\" cellpadding=\"1\"><tr><td bgcolor=\"#ff6600\"></td></tr></table><br><center><span class=\"yclinks\"><a href=\"newsguidelines.html\">Guidelines</a>\n",
      "        | <a href=\"newsfaq.html\">FAQ</a>\n",
      "        | <a href=\"mailto:hn@ycombinator.com\">Support</a>\n",
      "        | <a href=\"https://github.com/HackerNews/API\">API</a>\n",
      "        | <a href=\"security.html\">Security</a>\n",
      "        | <a href=\"lists\">Lists</a>\n",
      "        | <a href=\"bookmarklet.html\" rel=\"nofollow\">Bookmarklet</a>\n",
      "        | <a href=\"http://www.ycombinator.com/legal/\">Legal</a>\n",
      "        | <a href=\"http://www.ycombinator.com/apply/\">Apply to YC</a>\n",
      "        | <a href=\"mailto:hn@ycombinator.com\">Contact</a></span><br><br><form method=\"get\" action=\"//hn.algolia.com/\">Search:\n",
      "          <input type=\"text\" name=\"q\" value=\"\" size=\"17\" autocorrect=\"off\" spellcheck=\"false\" autocapitalize=\"off\" autocomplete=\"false\"></form>\n",
      "            </center></td></tr>\n",
      "      </table></center></body><script type='text/javascript' src='hn.js?xDJZ1aWhiD4MZBrpGsuq'></script>\n",
      "  </html>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "\n",
    "# Fetch a web page\n",
    "r = requests.get(\"https://news.ycombinator.com\")\n",
    "print(r.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "            \n",
      "          \n",
      "        Hacker News\n",
      "        \n",
      "                  Hacker News\n",
      "              new | past | comments | ask | show | jobs | submit            \n",
      "                              login\n",
      "                          \n",
      "              \n",
      "\n",
      "              \n",
      "      1.      Advanced Data Structures (mit.edu)\n",
      "        315 points by rjammala 2 hours ago  | hide | 27&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      2.      Google to restrict modern ad blocking Chrome extensions to enterprise users (9to5google.com)\n",
      "        221 points by estranhosidade 3 hours ago  | hide | 119&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      3.      Roc – Real-Time streaming over the network (gavv.github.io)\n",
      "        33 points by gavv42 1 hour ago  | hide | 9&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      4.      Exponential economist meets finite physicist (ucsd.edu)\n",
      "        42 points by chepaslaaa 2 hours ago  | hide | 26&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      5.      0x Launch Kit – Launch your own cryptocurrency exchange or marketplace (0x.org)\n",
      "        296 points by tomhschmidt 6 hours ago  | hide | 115&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      6.      Falsehoods Programmers Believe About Search (opensourceconnections.com)\n",
      "        228 points by binarymax 9 hours ago  | hide | 138&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      7.      The Boring Company will develop an underground “people mover” for Las Vegas (citylab.com)\n",
      "        150 points by cienega 5 hours ago  | hide | 259&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      8.      Complex life may only exist because of millions of years of groundwork by fungi (theconversation.com)\n",
      "        131 points by pradpk 9 hours ago  | hide | 42&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      9.      A website that runs on a solar-powered server in Barcelona (lowtechmagazine.com)\n",
      "        279 points by peey 12 hours ago  | hide | 86&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      10.      Show HN: PugSQL, a Python Port of HugSQL (pugsql.org)\n",
      "        114 points by mcfunley 8 hours ago  | hide | 39&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      11.      A way to do atomic writes (lwn.net)\n",
      "        89 points by Tomte 8 hours ago  | hide | 34&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      12.      An exoplanet has been found in the ‘Neptunian Desert’ (phys.org)\n",
      "        130 points by lelf 10 hours ago  | hide | 54&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      13.      DRM enabled Google to have an open source browser still under its control (boingboing.net)\n",
      "        67 points by phowat 2 hours ago  | hide | 37&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      14.      What’s the Magic Behind Graphene’s ‘Magic’ Angle? (quantamagazine.org)\n",
      "        128 points by pseudolus 10 hours ago  | hide | 16&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      15.      Building Facebook's Service Encryption Infastructure (fb.com)\n",
      "        129 points by sudoyear123 9 hours ago  | hide | 36&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      16.      Open-sourcing the Lightning Web Components framework (salesforce.com)\n",
      "        113 points by kungfudoi 8 hours ago  | hide | 65&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      17.      Should Failing Phish Tests Be a Fireable Offense? (krebsonsecurity.com)\n",
      "        99 points by headalgorithm 5 hours ago  | hide | 212&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      18.      Hack on Bitcoin in SF at Sparkswap (YC S18) (keyvalues.com)\n",
      "        1 hour ago | hide      \n",
      "      \n",
      "                \n",
      "      19.      Outschooling (avc.com)\n",
      "        78 points by ctulek 7 hours ago  | hide | 99&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      20.      TypeScript 3.5 (microsoft.com)\n",
      "        196 points by DanRosenwasser 5 hours ago  | hide | 43&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      21.      Plaid Launches in the UK (plaid.com)\n",
      "        129 points by jessedhillon 9 hours ago  | hide | 67&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      22.      Uber will start deactivating riders with low ratings (techcrunch.com)\n",
      "        188 points by hbcondo714 7 hours ago  | hide | 487&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      23.      Apple App Store Principles and Practices (apple.com)\n",
      "        136 points by tosh 8 hours ago  | hide | 233&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      24.      Long Before Boeing 737 Max Crash, Ethiopian Air Pilot Warned of Dangers (bloomberg.com)\n",
      "        176 points by pseudolus 10 hours ago  | hide | 108&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      25.      Angular v8.0 (github.com)\n",
      "        241 points by tashoecraft 11 hours ago  | hide | 184&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      26.      U.S. Says Russia Might Be Setting Off Low-Yield Nuclear Weapons (thedrive.com)\n",
      "        14 points by tomohawk 1 hour ago  | hide | discuss              \n",
      "      \n",
      "                \n",
      "      27.      Delisted Overnight: A Cautionary Tale for Indie iOS Developers (russ.app)\n",
      "        179 points by rooster8 6 hours ago  | hide | 98&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      28.      Genode OS Framework 19.05 (genode.org)\n",
      "        71 points by snvzz 10 hours ago  | hide | 17&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      29.      Buyer's Remorse: High Debt and Low Pay Leave Some College Grads Rueful (bloomberg.com)\n",
      "        220 points by pseudolus 9 hours ago  | hide | 412&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      30.      Security Company Building an International Database of Banned Bar Patrons (onezero.medium.com)\n",
      "        94 points by jbegley 8 hours ago  | hide | 115&nbsp;comments              \n",
      "      \n",
      "            More\n",
      "  \n",
      "\n",
      "Guidelines\n",
      "        | FAQ\n",
      "        | Support\n",
      "        | API\n",
      "        | Security\n",
      "        | Lists\n",
      "        | Bookmarklet\n",
      "        | Legal\n",
      "        | Apply to YC\n",
      "        | ContactSearch:\n",
      "          \n",
      "            \n",
      "      \n",
      "  \n",
      "\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "# Remove HTML tags using RegEx\n",
    "pattern = re.compile(r'<.*?>')  # tags look like <...>\n",
    "print(pattern.sub('', r.text))  # replace them with blank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "            \n",
      "          \n",
      "        Hacker News\n",
      "        \n",
      "                  Hacker News\n",
      "              new | past | comments | ask | show | jobs | submit            \n",
      "                              login\n",
      "                          \n",
      "              \n",
      "\n",
      "              \n",
      "      1.      Advanced Data Structures (mit.edu)\n",
      "        315 points by rjammala 2 hours ago  | hide | 27 comments              \n",
      "      \n",
      "                \n",
      "      2.      Google to restrict modern ad blocking Chrome extensions to enterprise users (9to5google.com)\n",
      "        221 points by estranhosidade 3 hours ago  | hide | 119 comments              \n",
      "      \n",
      "                \n",
      "      3.      Roc – Real-Time streaming over the network (gavv.github.io)\n",
      "        33 points by gavv42 1 hour ago  | hide | 9 comments              \n",
      "      \n",
      "                \n",
      "      4.      Exponential economist meets finite physicist (ucsd.edu)\n",
      "        42 points by chepaslaaa 2 hours ago  | hide | 26 comments              \n",
      "      \n",
      "                \n",
      "      5.      0x Launch Kit – Launch your own cryptocurrency exchange or marketplace (0x.org)\n",
      "        296 points by tomhschmidt 6 hours ago  | hide | 115 comments              \n",
      "      \n",
      "                \n",
      "      6.      Falsehoods Programmers Believe About Search (opensourceconnections.com)\n",
      "        228 points by binarymax 9 hours ago  | hide | 138 comments              \n",
      "      \n",
      "                \n",
      "      7.      The Boring Company will develop an underground “people mover” for Las Vegas (citylab.com)\n",
      "        150 points by cienega 5 hours ago  | hide | 259 comments              \n",
      "      \n",
      "                \n",
      "      8.      Complex life may only exist because of millions of years of groundwork by fungi (theconversation.com)\n",
      "        131 points by pradpk 9 hours ago  | hide | 42 comments              \n",
      "      \n",
      "                \n",
      "      9.      A website that runs on a solar-powered server in Barcelona (lowtechmagazine.com)\n",
      "        279 points by peey 12 hours ago  | hide | 86 comments              \n",
      "      \n",
      "                \n",
      "      10.      Show HN: PugSQL, a Python Port of HugSQL (pugsql.org)\n",
      "        114 points by mcfunley 8 hours ago  | hide | 39 comments              \n",
      "      \n",
      "                \n",
      "      11.      A way to do atomic writes (lwn.net)\n",
      "        89 points by Tomte 8 hours ago  | hide | 34 comments              \n",
      "      \n",
      "                \n",
      "      12.      An exoplanet has been found in the ‘Neptunian Desert’ (phys.org)\n",
      "        130 points by lelf 10 hours ago  | hide | 54 comments              \n",
      "      \n",
      "                \n",
      "      13.      DRM enabled Google to have an open source browser still under its control (boingboing.net)\n",
      "        67 points by phowat 2 hours ago  | hide | 37 comments              \n",
      "      \n",
      "                \n",
      "      14.      What’s the Magic Behind Graphene’s ‘Magic’ Angle? (quantamagazine.org)\n",
      "        128 points by pseudolus 10 hours ago  | hide | 16 comments              \n",
      "      \n",
      "                \n",
      "      15.      Building Facebook's Service Encryption Infastructure (fb.com)\n",
      "        129 points by sudoyear123 9 hours ago  | hide | 36 comments              \n",
      "      \n",
      "                \n",
      "      16.      Open-sourcing the Lightning Web Components framework (salesforce.com)\n",
      "        113 points by kungfudoi 8 hours ago  | hide | 65 comments              \n",
      "      \n",
      "                \n",
      "      17.      Should Failing Phish Tests Be a Fireable Offense? (krebsonsecurity.com)\n",
      "        99 points by headalgorithm 5 hours ago  | hide | 212 comments              \n",
      "      \n",
      "                \n",
      "      18.      Hack on Bitcoin in SF at Sparkswap (YC S18) (keyvalues.com)\n",
      "        1 hour ago | hide      \n",
      "      \n",
      "                \n",
      "      19.      Outschooling (avc.com)\n",
      "        78 points by ctulek 7 hours ago  | hide | 99 comments              \n",
      "      \n",
      "                \n",
      "      20.      TypeScript 3.5 (microsoft.com)\n",
      "        196 points by DanRosenwasser 5 hours ago  | hide | 43 comments              \n",
      "      \n",
      "                \n",
      "      21.      Plaid Launches in the UK (plaid.com)\n",
      "        129 points by jessedhillon 9 hours ago  | hide | 67 comments              \n",
      "      \n",
      "                \n",
      "      22.      Uber will start deactivating riders with low ratings (techcrunch.com)\n",
      "        188 points by hbcondo714 7 hours ago  | hide | 487 comments              \n",
      "      \n",
      "                \n",
      "      23.      Apple App Store Principles and Practices (apple.com)\n",
      "        136 points by tosh 8 hours ago  | hide | 233 comments              \n",
      "      \n",
      "                \n",
      "      24.      Long Before Boeing 737 Max Crash, Ethiopian Air Pilot Warned of Dangers (bloomberg.com)\n",
      "        176 points by pseudolus 10 hours ago  | hide | 108 comments              \n",
      "      \n",
      "                \n",
      "      25.      Angular v8.0 (github.com)\n",
      "        241 points by tashoecraft 11 hours ago  | hide | 184 comments              \n",
      "      \n",
      "                \n",
      "      26.      U.S. Says Russia Might Be Setting Off Low-Yield Nuclear Weapons (thedrive.com)\n",
      "        14 points by tomohawk 1 hour ago  | hide | discuss              \n",
      "      \n",
      "                \n",
      "      27.      Delisted Overnight: A Cautionary Tale for Indie iOS Developers (russ.app)\n",
      "        179 points by rooster8 6 hours ago  | hide | 98 comments              \n",
      "      \n",
      "                \n",
      "      28.      Genode OS Framework 19.05 (genode.org)\n",
      "        71 points by snvzz 10 hours ago  | hide | 17 comments              \n",
      "      \n",
      "                \n",
      "      29.      Buyer's Remorse: High Debt and Low Pay Leave Some College Grads Rueful (bloomberg.com)\n",
      "        220 points by pseudolus 9 hours ago  | hide | 412 comments              \n",
      "      \n",
      "                \n",
      "      30.      Security Company Building an International Database of Banned Bar Patrons (onezero.medium.com)\n",
      "        94 points by jbegley 8 hours ago  | hide | 115 comments              \n",
      "      \n",
      "            More\n",
      "  \n",
      "\n",
      "Guidelines\n",
      "        | FAQ\n",
      "        | Support\n",
      "        | API\n",
      "        | Security\n",
      "        | Lists\n",
      "        | Bookmarklet\n",
      "        | Legal\n",
      "        | Apply to YC\n",
      "        | ContactSearch:\n",
      "          \n",
      "            \n",
      "      \n",
      "  \n",
      "\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# Remove HTML tags using Beautiful Soup library\n",
    "soup = BeautifulSoup(r.text, \"html5lib\")\n",
    "print(soup.get_text())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tr class=\"athing\" id=\"20044876\">\n",
       "      <td align=\"right\" class=\"title\" valign=\"top\"><span class=\"rank\">1.</span></td>      <td class=\"votelinks\" valign=\"top\"><center><a href=\"vote?id=20044876&amp;how=up&amp;goto=news\" id=\"up_20044876\"><div class=\"votearrow\" title=\"upvote\"></div></a></center></td><td class=\"title\"><a class=\"storylink\" href=\"https://courses.csail.mit.edu/6.851/fall17/\">Advanced Data Structures</a><span class=\"sitebit comhead\"> (<a href=\"from?site=mit.edu\"><span class=\"sitestr\">mit.edu</span></a>)</span></td></tr>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Find all articles\n",
    "summaries = soup.find_all(\"tr\", class_=\"athing\")\n",
    "summaries[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "summaries[0].select_one(\"h3 a\").get_text().strip()\n",
    "summaries[0].select_one(\"div[data-course-short-summary]\").get_texxt().strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Advanced Data Structures'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Extract title\n",
    "summaries[0].find(\"a\", class_=\"storylink\").get_text().strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "30 Article summaries found. Sample:\n",
      "Advanced Data Structures\n"
     ]
    }
   ],
   "source": [
    "# Find all articles, extract titles\n",
    "articles = []\n",
    "summaries = soup.find_all(\"tr\", class_=\"athing\")\n",
    "for summary in summaries:\n",
    "    title = summary.find(\"a\", class_=\"storylink\").get_text().strip()\n",
    "    articles.append((title))\n",
    "\n",
    "print(len(articles), \"Article summaries found. Sample:\")\n",
    "print(articles[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Normalization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Case Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample text\n",
    "text = \"The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?\"\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to lowercase\n",
    "text = text.lower() \n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Punctuation Removal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Remove punctuation characters\n",
    "text = re.sub(r\"[^a-zA-Z0-9]\", \" \", text) \n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split text into tokens (words)\n",
    "words = text.split()\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NLTK: Natural Language ToolKit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     C:\\Users\\ckd16\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     C:\\Users\\ckd16\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Unzipping corpora\\stopwords.zip.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.data.path.append(os.path.join(os.getcwd(), \"nltk_data\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.\n"
     ]
    }
   ],
   "source": [
    "# Another sample text\n",
    "text = \"Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.\"\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']\n"
     ]
    }
   ],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "# Split text into words using NLTK\n",
    "words = word_tokenize(text)\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']\n"
     ]
    }
   ],
   "source": [
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "# Split text into sentences\n",
    "sentences = sent_tokenize(text)\n",
    "print(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
     ]
    }
   ],
   "source": [
    "# List stop words\n",
    "from nltk.corpus import stopwords\n",
    "print(stopwords.words(\"english\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reset text\n",
    "text = \"The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?\"\n",
    "\n",
    "# Normalize it\n",
    "text = re.sub(r\"[^a-zA-Z0-9]\", \" \", text.lower())\n",
    "\n",
    "# Tokenize it\n",
    "words = text.split()\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Dr.', 'Smith', 'graduated', 'University', 'Washington', '.', 'He', 'later', 'started', 'analytics', 'firm', 'called', 'Lux', ',', 'catered', 'enterprise', 'customers', '.']\n"
     ]
    }
   ],
   "source": [
    "# Remove stop words\n",
    "words = [w for w in words if w not in stopwords.words(\"english\")]\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sentence Parsing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "# Define a custom grammar\n",
    "my_grammar = nltk.CFG.fromstring(\"\"\"\n",
    "S -> NP VP\n",
    "PP -> P NP\n",
    "NP -> Det N | Det N PP | 'I'\n",
    "VP -> V NP | VP PP\n",
    "Det -> 'an' | 'my'\n",
    "N -> 'elephant' | 'pajamas'\n",
    "V -> 'shot'\n",
    "P -> 'in'\n",
    "\"\"\")\n",
    "parser = nltk.ChartParser(my_grammar)\n",
    "\n",
    "# Parse a sentence\n",
    "sentence = word_tokenize(\"I shot an elephant in my pajamas\")\n",
    "for tree in parser.parse(sentence):\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S\n",
      "  (NP I)\n",
      "  (VP\n",
      "    (VP (V shot) (NP (Det an) (N elephant)))\n",
      "    (PP (P in) (NP (Det my) (N pajamas)))))\n",
      "(S\n",
      "  (NP I)\n",
      "  (VP\n",
      "    (V shot)\n",
      "    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     C:\\Users\\ckd16\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     C:\\Users\\ckd16\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     C:\\Users\\ckd16\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
      "[nltk_data]       date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('I', 'PRP'),\n",
       " ('shot', 'VBP'),\n",
       " ('an', 'DT'),\n",
       " ('elephant', 'NN'),\n",
       " ('in', 'IN'),\n",
       " ('my', 'PRP$'),\n",
       " ('pajamas', 'NN')]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.download('averaged_perceptron_tagger')\n",
    "nltk.data.path.append(os.path.join(os.getcwd(), \"nltk_data\"))\n",
    "from nltk import pos_tag\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "my_grammar = nltk.CFG.fromstring(\"\"\"\n",
    "S -> NP VP\n",
    "PP -> P NP\n",
    "NP -> Det N | Det N PP | 'I'\n",
    "VP -> V NP | VP PP\n",
    "Det -> 'an' | 'my'\n",
    "N -> 'elephant' | 'pajamas'\n",
    "V -> 'shot'\n",
    "P -> 'in'\n",
    "\"\"\")\n",
    "parser = nltk.ChartParser(my_grammar)\n",
    "\n",
    "sentence = word_tokenize(\"I shot an elephant in my pajamas\")\n",
    "\n",
    "for tree in parser.parse(sentence):\n",
    "    print (tree)\n",
    "#     tree.draw()\n",
    "\n",
    "pos_tag(sentence)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stemming & Lemmatization\n",
    "\n",
    "### Stemming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "# Reduce words to their stems\n",
    "stemmed = [PorterStemmer().stem(w) for w in words]\n",
    "print(stemmed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "\n",
    "# Reduce words to their root form\n",
    "lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]\n",
    "print(lemmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lemmatize verbs by specifying pos\n",
    "lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in lemmed]\n",
    "print(lemmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
